PCA Tutorial
A tutorial on how to run principal component analyses and heirarchical clustering in R
Introduction
Principal component analysis (PCA) is a statistical process commonly used for dimensionality reduction by projecting each data point onto only the first few principal components to obtain lower-dimensional data while preserving as much of the data’s variation as possible. The first principal component can equivalently be defined as a direction that maximizes the variance of the projected data.
For more info: https://en.wikipedia.org/wiki/Principal_component_analysis
# devtools::install_github("derekmichaelwright/agData")
library(agData)
library(scales) # rescale()
library(GGally) # ggpairs()
library(plotly) # plot_ly()
library(rworldmap) # mapPies()
library(FactoMineR) # PCA() & HCPC()
library(htmlwidgets) # saveWidget()Iris Data Set
Data Source
Fisher, R. A. (1936) The use of multiple measurements in taxonomic problems. Annals of Eugenics. 7(II): 179–188.
Anderson, Edgar (1935) The irises of the Gaspe Peninsula. Bulletin of the American Iris Society. 59: 2–5.
# print the data
DT::datatable(iris)Correlations
# Plot
mp <- ggpairs(iris, columns = 1:4, aes(color = Species)) +
scale_color_manual(values = alpha(agData_Colors,0.7)) +
scale_fill_manual(values = alpha(agData_Colors,0.7)) +
theme_agData() +
labs(title = "Iris Data Set",
caption = "\u00A9 derekmichaelwright.github.io/dblogr/ | Data: Anderson (1935) & Fisher (1936)")
ggsave("pca_tutorial_1_01.png", mp, width = 8, height = 6)PCA
# PCA
mypca <- PCA(iris[,1:4], graph = F)
# HPCA
myhpca <- HCPC(mypca, nb.clust = 3, graph = F)
#
pcs <- mypca[[3]]$coord %>% as.data.frame()
xx <- iris %>%
mutate(Cluster = myhpca[[1]]$clust) %>%
bind_cols(pcs)
#
colors <- c("darkslategrey", "darkred", "darkgreen")
shapes <- c(16, 13, 15)
# Plot
mp <- ggplot(xx, aes(x = Dim.1, y = Dim.2, color = Cluster, pch = Species)) +
geom_point(size = 2, alpha = 0.8) +
scale_color_manual(values = colors) +
scale_shape_manual(values = shapes) +
theme_agData() +
labs(title = "Iris Data Set", x = "PC 1", y = "PC 2",
caption = "\u00A9 derekmichaelwright.github.io/dblogr/ | Data: Anderson (1935)")
ggsave("pca_tutorial_1_02.png", mp, width = 6, height = 4)3D Plots
Species
https://dblogr.com/academic/pca_tutorial/pca_tutorial_iris_1.html
mp <- plot_ly(iris, x = ~Sepal.Length, y = ~Sepal.Width, z = ~Petal.Length, text = ~Species,
color = ~Species, colors = colors,
symbol = ~Species, symbols = shapes) %>%
add_markers()
htmlwidgets::saveWidget(mp, file="pca_tutorial_iris_1.html")PCA
https://dblogr.com/academic/pca_tutorial/pca_tutorial_iris_2.html
mp <- plot_ly(xx, x = ~Dim.1, y = ~Dim.2, z = ~Dim.3, text = ~Species,
color = ~Cluster, colors = colors,
symbol = ~Species, symbols = shapes) %>%
add_markers()
htmlwidgets::saveWidget(mp, file="pca_tutorial_iris_2.html")LDP Data Set
Data Source
Wright, D.M., Neupane, S., Heidecker, T., Haile, T.A., Chan, C., Coyne, C.J., McGee, R.J., Udupa, S., Henkrar, F., Barilli, E., Rubiales, D., Gioia, T., Logozzo, G., Marzario, S., Mehra, R., Sarker, A., Dhakal, R., Anwar, B., Sarker, D., Vandenberg A. & Bett K.E. (2020) Understanding photothermal interactions can help expand production range and increase genetic diversity of lentil (Lens culinaris Medik.). Plants, People, Planet. 00: 1-11. https://nph.onlinelibrary.wiley.com/doi/10.1002/ppp3.10158
Raw Data
# Prep data
myLDP <- read.csv("data_ldp.csv")
myD <- read.csv("data_phenology.csv")
myC <- read.csv("data_countries.csv")
myExpts <- c("Ro16", "Ro17", "Su16", "Su17", "Su18", "Us18",
"In16", "In17", "Ba16", "Ba17", "Ne16", "Ne17",
"Sp16", "Sp17", "Mo16", "Mo17", "It16", "It17" )
myColors <- c("darkred", "darkorange3", "darkgoldenrod2", "deeppink3",
"steelblue", "darkorchid4", "cornsilk4", "darkgreen")
xx <- myD %>%
mutate(Expt = factor(Expt, levels = myExpts)) %>%
group_by(Expt) %>%
mutate(DTF_Scaled = rescale(DTF, c(1,5))) %>%
ungroup() %>%
gather(Trait, Value, DTF, DTF_Scaled)
mp <- ggplot(xx, aes(x = Expt, y = Value)) +
geom_quasirandom(size = 0.2, alpha = 0.5) +
facet_grid(Trait ~ ., scales = "free_y") +
theme_agData() +
labs(y = NULL, x = NULL)
ggsave("pca_tutorial_2_01.png", mp, width = 8, height = 6)PCA
# Prep data
xx <- read.csv("data_phenology.csv") %>%
mutate(Expt = factor(Expt, levels = myExpts),
DTF = round(DTF)) %>%
select(Entry, Expt, DTF) %>%
spread(Expt, DTF) %>%
column_to_rownames("Entry")
DT::datatable(xx)# PCA
mypca <- PCA(xx, ncp = 10, graph = F)
# Heirarcical clustering
mypcaH <- HCPC(mypca, nb.clust = 8, graph = F)
# Bind together
x1 <- mypcaH[[1]] %>% rownames_to_column("Entry")
x2 <- mypca[[3]]$coord %>% as.data.frame() %>% rownames_to_column("Entry")
myPC <- left_join(x1, x2, by = "Entry") %>%
mutate(Entry = as.numeric(Entry)) %>%
left_join(select(myLDP, Entry, Name, Origin), by = "Entry") %>%
left_join(select(myC, Origin=Country, Region), by = "Origin") %>%
select(Entry, Name, Origin, Region, Cluster=clust,
PC1=Dim.1, PC2=Dim.2, PC3=Dim.3, PC4=Dim.4, PC5=Dim.5,
PC6=Dim.6, PC7=Dim.7, PC8=Dim.8, PC9=Dim.9, PC10=Dim.10)
# Print PC's
DT::datatable(myPC)PCA Plots
# Plot
mp <- ggplot(myPC, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point() +
scale_color_manual(values = myColors) +
theme_agData(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank())
ggsave("pca_tutorial_2_02.png", mp, width = 6, height = 4)https://dblogr.com/academic/pca_tutorial/pca_tutorial_ldp_1.html
mp <- plot_ly(myPC, x = ~PC1, y = ~PC2*2.5, z = ~PC3*2.5,
text = ~paste(Entry, "|", Name, "\nOrigin:", Origin),
color = ~Cluster, colors = myColors) %>%
add_markers()
htmlwidgets::saveWidget(mp, file="pca_tutorial_ldp_1.html")Pc1 vs PC2 vs PC3
# Prep data
perc <- round(mypca[[1]][,2], 1)
# Plot PCA 1v2
find_hull <- function(df) df[chull(df[,"PC1"], df[,"PC2"]), ]
polys <- plyr::ddply(myPC, "Cluster", find_hull) %>% mutate(Cluster = factor(Cluster))
mp1 <- ggplot(myPC) +
geom_polygon(data = polys, alpha = 0.15, aes(x = PC1, y = PC2, fill = Cluster)) +
geom_point(aes(x = PC1, y = PC2, colour = Cluster)) +
scale_fill_manual(values = myColors) +
scale_color_manual(values = myColors) +
theme_agData() +
theme(legend.position = "none", panel.grid = element_blank()) +
labs(x = paste0("PC1 (", perc[1], "%)"),
y = paste0("PC2 (", perc[2], "%)"))
# Plot PCA 1v3
find_hull <- function(df) df[chull(df[,"PC1"], df[,"PC3"]), ]
polys <- plyr::ddply(myPC, "Cluster", find_hull) %>% mutate(Cluster = factor(Cluster))
mp2 <- ggplot(myPC) +
geom_polygon(data = polys, alpha = 0.15, aes(x = PC1, y = PC3, fill = Cluster)) +
geom_point(aes(x = PC1, y = PC3, colour = Cluster)) +
scale_fill_manual(values = myColors) +
scale_color_manual(values = myColors) +
theme_agData() +
theme(legend.position = "none", panel.grid = element_blank()) +
labs(x = paste0("PC1 (", perc[1], "%)"),
y = paste0("PC3 (", perc[3], "%)"))
# Plot PCA 2v3
find_hull <- function(df) df[chull(df[,"PC2"], df[,"PC3"]), ]
polys <- plyr::ddply(myPC, "Cluster", find_hull) %>% mutate(Cluster = factor(Cluster))
mp3 <- ggplot(myPC) +
geom_polygon(data = polys, alpha = 0.15, aes(x = PC2, y = PC3, fill = Cluster)) +
geom_point(aes(x = PC2, y = PC3, colour = Cluster)) +
scale_fill_manual(values = myColors) +
scale_color_manual(values = myColors) +
theme_agData() +
theme(legend.position = "none", panel.grid = element_blank()) +
labs(x = paste0("PC2 (", perc[2], "%)"),
y = paste0("PC3 (", perc[3], "%)"))
# Append
mp <- ggarrange(mp1, mp2, mp3, nrow = 1, ncol = 3, hjust = 0)
ggsave("pca_tutorial_2_03.png", mp, width = 8, height = 2.5)Phenotypes by Cluster Group
# Prep data
xx <- read.csv("data_phenology.csv") %>%
mutate(Expt = factor(Expt, levels = myExpts)) %>%
group_by(Expt) %>%
mutate(DTF_Scaled = scales::rescale(DTF, c(1,5)),
DTF_Scaled = round(DTF_Scaled,1)) %>%
ungroup() %>%
select(Entry, Expt, DTF_Scaled) %>%
left_join(select(myPC, Entry, Cluster), by = "Entry") %>%
group_by(Expt, Cluster) %>%
summarise(mean = mean(DTF_Scaled, na.rm = T), sd = sd(DTF_Scaled, na.rm = T)) %>%
ungroup()
# Plot
mp <- ggplot(xx, aes(x = Expt, y = mean, group = Cluster)) +
geom_point(aes(color = Cluster)) +
geom_vline(xintercept = 6.5, lty = 2) +
geom_vline(xintercept = 12.5, lty = 2) +
geom_ribbon(aes(ymin = mean - sd, ymax = mean + sd, fill = Cluster),
alpha = 0.2, color = NA) +
geom_line(aes(color = Cluster), size = 1) +
scale_color_manual(values = myColors) +
scale_fill_manual(values = myColors) +
theme_agData() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position = "none", panel.grid = element_blank()) +
labs(y = "DTF (scaled 1-5)", x = NULL)
#
ggsave("pca_tutorial_2_04.png", mp, width = 8, height =3.5)Pie Map
# Plot
xx <- myPC %>%
filter(!Origin %in% c("ICARDA","USDA","Unknown")) %>%
group_by(Origin, Cluster) %>%
summarise(Count = n()) %>%
spread(Cluster, Count) %>%
left_join(select(myC, Origin=Country, Lat, Lon), by = "Origin") %>%
ungroup() %>%
as.data.frame()
xx[is.na(xx)] <- 0
png("pca_tutorial_2_05.png", width = 4800, height = 2200, res = 600)
par(mai = c(0,0,0,0), xaxs = "i", yaxs = "i")
mapPies(dF = xx, nameX = "Lon", nameY = "Lat", zColours = myColors,
nameZs = c("1","2","3","4","5","6","7","8"), lwd = 1,
xlim = c(-140,110), ylim = c(0,20), addCatLegend = F,
oceanCol = "grey90", landCol = "white", borderCol = "black") ## symbolMaxSize= 5 maxSumValues= 49 symbolScale= 0.7142857
## List of 2
## $ x: num [1:100] -125 -125 -125 -125 -125 ...
## $ y: num [1:100] 57.3 57.6 57.9 58.2 58.5 ...
legend(-139.5, 15.5, title = "Cluster", legend = 1:8, col = myColors,
pch = 16, cex = 1, pt.cex = 2, box.lwd = 2)
dev.off()## png
## 2
© Derek Michael Wright